pacman::p_load(
tidyverse,
here,
RColorBrewer,
lubridate,
scales,
GGally,
stats,
corrplot,
mice,
VIM
)
flights_dt <- read_csv(here("clean_data/flights_clean.csv"))
Rows: 327346 Columns: 38
── Column specification ─────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (11): origin, origin_name, dest, dest_name, carrier, carrier_name, tailnum, manufacturer, model, engine, type
dbl (22): dep_delay, arr_delay, air_time, distance, flight, engines, seats, aircraft_age, lat, lon, alt, wind_di...
dttm (5): dep_time, sched_dep_time, arr_time, sched_arr_time, time_hour
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7", "#000000")
summary(flights_dt)
dep_time sched_dep_time dep_delay origin origin_name
Min. :2013-01-01 05:17:00 Min. :2013-01-01 05:15:00 Min. : -43.00 Length:327346 Length:327346
1st Qu.:2013-04-05 05:56:00 1st Qu.:2013-04-05 06:00:00 1st Qu.: -5.00 Class :character Class :character
Median :2013-07-04 09:52:30 Median :2013-07-04 09:54:30 Median : -2.00 Mode :character Mode :character
Mean :2013-07-03 18:10:05 Mean :2013-07-03 18:02:48 Mean : 12.56
3rd Qu.:2013-10-01 18:12:45 3rd Qu.:2013-10-01 18:14:45 3rd Qu.: 11.00
Max. :2013-12-31 23:56:00 Max. :2013-12-31 23:59:00 Max. :1301.00
arr_time sched_arr_time arr_delay dest
Min. :2013-01-01 00:03:00 Min. :2013-01-01 00:05:00 Min. : -86.000 Length:327346
1st Qu.:2013-04-05 01:03:00 1st Qu.:2013-04-05 03:43:30 1st Qu.: -17.000 Class :character
Median :2013-07-04 11:36:30 Median :2013-07-04 11:55:30 Median : -5.000 Mode :character
Mean :2013-07-03 19:41:02 Mean :2013-07-03 19:59:24 Mean : 6.895
3rd Qu.:2013-10-01 20:07:00 3rd Qu.:2013-10-01 20:20:30 3rd Qu.: 14.000
Max. :2014-01-01 00:00:00 Max. :2013-12-31 23:59:00 Max. :1272.000
dest_name air_time distance flight carrier carrier_name
Length:327346 Min. : 20.0 Min. : 80 Min. : 1 Length:327346 Length:327346
Class :character 1st Qu.: 82.0 1st Qu.: 509 1st Qu.: 544 Class :character Class :character
Mode :character Median :129.0 Median : 888 Median :1467 Mode :character Mode :character
Mean :150.7 Mean :1048 Mean :1943
3rd Qu.:192.0 3rd Qu.:1389 3rd Qu.:3412
Max. :695.0 Max. :4983 Max. :8500
tailnum manufacturer model engines seats engine
Length:327346 Length:327346 Length:327346 Min. :1.00 Min. : 2.0 Length:327346
Class :character Class :character Class :character 1st Qu.:2.00 1st Qu.: 55.0 Class :character
Mode :character Mode :character Mode :character Median :2.00 Median :149.0 Mode :character
Mean :1.99 Mean :137.5
3rd Qu.:2.00 3rd Qu.:189.0
Max. :4.00 Max. :450.0
NA's :48329 NA's :48329
aircraft_age lat lon alt time_hour wind_dir
Min. : 8.00 Min. :21.32 Min. :-157.92 Min. : 3.0 Min. :2013-01-01 10:00:00 Min. : 0.0
1st Qu.:15.00 1st Qu.:32.90 1st Qu.: -95.34 1st Qu.: 26.0 1st Qu.:2013-04-05 10:00:00 1st Qu.:130.0
Median :19.00 Median :36.08 Median : -83.99 Median : 433.0 Median :2013-07-04 13:00:00 Median :220.0
Mean :19.59 Mean :35.97 Mean : -89.61 Mean : 583.8 Mean :2013-07-03 21:56:45 Mean :201.9
3rd Qu.:22.00 3rd Qu.:41.41 3rd Qu.: -80.15 3rd Qu.: 748.0 3rd Qu.:2013-10-01 22:00:00 3rd Qu.:290.0
Max. :65.00 Max. :61.17 Max. : -68.83 Max. :6602.0 Max. :2014-01-01 04:00:00 Max. :360.0
NA's :53493 NA's :7537 NA's :7537 NA's :7537 NA's :9574
humid hour minute year type temp
Min. : 12.74 Min. : 5.00 Min. : 0.00 Min. :1956 Length:327346 Min. : 10.94
1st Qu.: 43.74 1st Qu.: 9.00 1st Qu.: 8.00 1st Qu.:1999 Class :character 1st Qu.: 42.08
Median : 57.22 Median :13.00 Median :29.00 Median :2002 Mode :character Median : 57.20
Mean : 59.21 Mean :13.14 Mean :26.23 Mean :2001 Mean : 57.01
3rd Qu.: 74.67 3rd Qu.:17.00 3rd Qu.:44.00 3rd Qu.:2006 3rd Qu.: 71.96
Max. :100.00 Max. :23.00 Max. :59.00 Max. :2013 Max. :100.04
NA's :1544 NA's :53493 NA's :1544
dewp wind_speed precip pressure visib
Min. :-9.94 Min. : 0.000 Min. :0.0000 Min. : 983.8 Min. : 0.00
1st Qu.:26.06 1st Qu.: 6.905 1st Qu.:0.0000 1st Qu.:1012.9 1st Qu.:10.00
Median :42.80 Median :10.357 Median :0.0000 Median :1017.6 Median :10.00
Mean :41.50 Mean :11.060 Mean :0.0042 Mean :1017.9 Mean : 9.29
3rd Qu.:57.92 3rd Qu.:14.960 3rd Qu.:0.0000 3rd Qu.:1022.9 3rd Qu.:10.00
Max. :78.08 Max. :42.579 Max. :1.2100 Max. :1042.1 Max. :10.00
NA's :1544 NA's :1605 NA's :1527 NA's :36142 NA's :1527
Which airport in our dataset has the highest number of departures?

What is the trend of delays over the year, how does it compare across the three airports?
flights_dt %>%
filter(dep_delay >= 15) %>%
group_by(month = floor_date(sched_dep_time, "month"), origin) %>%
summarise(mean_delay = mean(dep_delay), .groups = "drop") %>%
ggplot() +
aes(x = month, y = mean_delay, colour = origin) +
geom_line(alpha = 0.6) +
geom_point(alpha = 0.8) +
labs(
x = "month",
y = "mean departure delay (minutes)",
title = " Mean delay by month",
colour = "airport"
) +
scale_colour_manual(values = c("#999999", "#E69F00", "#56B4E9")) +
theme_bw()

Which months have the highest and lowest mean departure delays?
# highest mean departure delay by month
flights_dt %>%
filter(dep_delay >= 15 & origin == "EWR") %>%
group_by(month = floor_date(sched_dep_time, "month"), origin) %>%
summarise(mean_delay = mean(dep_delay), .groups = "drop") %>%
arrange(desc(mean_delay))
# lowest departure delay by month
flights_dt %>%
filter(dep_delay >= 15 & origin == "EWR") %>%
group_by(month = floor_date(sched_dep_time, "month"), origin) %>%
summarise(mean_delay = mean(dep_delay), .groups = "drop") %>%
arrange(mean_delay)
What is the number of delayed flights over the year?
flights_dt %>%
filter(dep_delay >= 15) %>%
group_by(month = floor_date(sched_dep_time, "month"), origin) %>%
summarise(no_of_delays = n(), .groups = "drop") %>%
ggplot() +
aes(x = month, y = no_of_delays, colour = origin) +
geom_line(alpha = 0.6) +
geom_point(alpha = 0.8) +
labs(
x = "month",
y = "number of delays",
title = "Number of delayed flights by month",
colour = "airport"
) +
scale_colour_manual(values = c("#999999", "#E69F00", "#56B4E9")) +
theme_bw()

Which month has the highest number of delays?
flights_dt %>%
filter(dep_delay >= 15 & origin == "EWR") %>%
group_by(month = floor_date(sched_dep_time, "month"), origin) %>%
summarise(no_of_delays = n(), .groups = "drop") %>%
arrange(desc(no_of_delays))
What were the weather conditions during these months?
flights_dt %>%
mutate(date = as.Date(dep_time, label = TRUE),
month = month(dep_time, label = TRUE)) %>%
filter(dep_delay >= 15 & origin == "EWR" & month == "Dec") %>%
select(date, month, pressure, precip, temp, wind_speed, wind_dir, dewp, visib, humid) %>%
group_by(date) %>%
summarise(
mean_pressure = mean(pressure, na.rm = TRUE),
mean_precip = mean(precip, na.rm = TRUE),
mean_temp = mean(temp, na.rm = TRUE),
mean_wind_speed = mean(wind_speed, na.rm = TRUE),
mean_wind_dir = mean(wind_dir, na.rm = TRUE),
mean_dewp = mean(dewp, na.rm = TRUE),
mean_visib = mean(visib, na.rm = TRUE),
mean_humid = mean(humid, na.rm = TRUE)
) %>%
pivot_longer(
cols = mean_pressure:mean_humid,
names_to = "variable",
values_to = "value"
) %>%
ggplot() +
aes(x = date, y = value, colour = variable) +
geom_line() +
scale_colour_manual(values = cbPalette) +
theme_bw()
Warning: Removed 8 row(s) containing missing values (geom_path).

flights_dt %>%
mutate(date = as.Date(dep_time, label = TRUE),
month = month(dep_time, label = TRUE)) %>%
filter(dep_delay >= 15 & origin == "EWR" & month == "Dec") %>%
select(date, dep_delay, month, pressure, precip, temp, wind_speed, wind_dir, dewp, visib, humid) %>%
group_by(date) %>%
summarise(
departure_delay = mean(dep_delay),
pressure = mean(pressure),
precipitation = mean(precip),
temperature = mean(temp),
wind_speed = mean(wind_speed),
wind_direction = mean(wind_dir),
dewpoint = mean(dewp),
visibility = mean(visib),
humidity = mean(humid)
) %>%
na.omit() %>%
kable(align = "c") %>%
kable_styling(full_width = FALSE) %>%
column_spec(1:10, color = "black")
| date |
departure_delay |
pressure |
precipitation |
temperature |
wind_speed |
wind_direction |
dewpoint |
visibility |
humidity |
| 2013-12-02 |
60.08537 |
1013.267 |
0.0000000 |
46.32317 |
1.684068 |
11.58537 |
33.83073 |
7.402439 |
62.29524 |
| 2013-12-03 |
46.37931 |
1013.555 |
0.0000000 |
48.27759 |
5.297556 |
176.55172 |
33.99241 |
8.275862 |
59.37724 |
| 2013-12-04 |
58.64286 |
1019.479 |
0.0000000 |
47.24857 |
3.074227 |
39.14286 |
39.51371 |
6.314286 |
74.70871 |
| 2013-12-11 |
50.72414 |
1024.549 |
0.0000000 |
29.61655 |
12.248532 |
234.48276 |
16.42483 |
10.000000 |
57.65954 |
| 2013-12-12 |
41.20000 |
1024.931 |
0.0000000 |
24.20825 |
11.234490 |
251.25000 |
6.60200 |
10.000000 |
46.63287 |
| 2013-12-13 |
48.45902 |
1023.211 |
0.0000000 |
31.33311 |
10.772055 |
247.21311 |
14.74656 |
10.000000 |
49.98475 |
| 2013-12-16 |
43.13000 |
1020.163 |
0.0000000 |
27.44420 |
11.116535 |
279.50000 |
9.73220 |
9.950000 |
46.96260 |
| 2013-12-18 |
41.73529 |
1019.113 |
0.0000000 |
31.42294 |
9.950862 |
252.54902 |
19.94529 |
9.950980 |
62.86186 |
| 2013-12-19 |
53.08904 |
1019.290 |
0.0000000 |
38.14096 |
5.777546 |
199.93151 |
26.13644 |
9.739726 |
62.29753 |
| 2013-12-20 |
56.01198 |
1016.354 |
0.0000000 |
45.38790 |
7.194098 |
207.42515 |
32.40527 |
9.952096 |
60.93689 |
| 2013-12-21 |
56.14516 |
1014.031 |
0.0004032 |
57.21452 |
7.081009 |
195.16129 |
48.57452 |
9.943548 |
73.58613 |
| 2013-12-24 |
41.00000 |
1021.422 |
0.0000000 |
36.45655 |
12.599057 |
317.75862 |
20.25966 |
10.000000 |
52.09621 |
| 2013-12-27 |
49.68468 |
1026.695 |
0.0000000 |
37.00757 |
8.159134 |
248.01802 |
19.18919 |
10.000000 |
48.95180 |
| 2013-12-28 |
36.03896 |
1020.871 |
0.0000000 |
45.39247 |
12.688470 |
228.83117 |
25.24182 |
10.000000 |
47.41364 |
flights_dt %>%
mutate(date = as.Date(dep_time, label = TRUE),
month = month(dep_time, label = TRUE)) %>%
filter(dep_delay >= 15 & origin == "EWR" & month == "Dec") %>%
select(date, dep_delay, month, pressure, precip, temp, wind_speed, wind_dir, dewp, visib, humid) %>%
group_by(date) %>%
summarise(
departure_delay = scale(dep_delay),
pressure = scale(pressure),
precipitation = scale(precip),
temperature = scale(temp),
wind_speed = scale(wind_speed),
wind_direction = scale(wind_dir),
dewpoint = scale(dewp),
visibility = scale(visib),
humidity = scale(humid)
) %>%
na.omit() %>%
pivot_longer(
cols = departure_delay:humidity,
names_to = "variable",
values_to = "value"
) %>%
ggplot() +
aes(x = date, y = value, colour = variable) +
geom_point(alpha = 0.8) +
scale_colour_manual(values = cbPalette) +
scale_x_date(labels = date_format("%d"), breaks = date_breaks("day")) +
labs(
colour = "",
title = "Weather variables December"
) +
theme_bw()
`summarise()` has grouped output by 'date'. You can override using the `.groups` argument.

What is the number of flights per month?
flights_dt %>%
mutate(month = month(sched_dep_time, label = TRUE)) %>%
group_by(month, origin) %>%
summarise(no_of_flights = n(), .groups = "drop") %>%
ggplot() +
aes(x = month, y = no_of_flights, fill = origin) +
geom_col(position = "dodge", alpha = 0.8) +
labs(
x = "month",
y = "number of departures",
title = "Departure numbers by month",
fill = "airport"
) +
scale_fill_manual(values = cbPalette) +
theme_bw()

Which carrier has the most departure delays?
flights_dt %>%
filter(dep_delay >= 15) %>%
group_by(carrier_name, origin) %>%
summarise(mean_delay = mean(dep_delay), .groups = "drop") %>%
ggplot() +
aes(x = reorder(carrier_name, mean_delay), y = mean_delay, fill = origin) +
geom_col(alpha = 0.8) +
facet_wrap(~ origin) +
labs(
x = "carrier",
y = "mean delay (minutes)",
title = "Mean departure delay by carrier",
fill = "airport"
) +
theme_bw() +
scale_fill_manual(values=c("#999999", "#E69F00", "#56B4E9")) +
theme(axis.text.x = element_text(angle = 0, vjust = 0.5, hjust = 1)) +
coord_flip()

Which carrier has the most arrival delays?
flights_dt %>%
filter(arr_delay >= 15) %>%
group_by(carrier_name, origin) %>%
summarise(mean_delay = mean(arr_delay), .groups = "drop") %>%
ggplot() +
aes(x = reorder(carrier_name, mean_delay), y = mean_delay, fill = origin) +
geom_col(alpha = 0.8) +
facet_wrap(~ origin) +
labs(
x = "carrier",
y = "mean delay (minutes)",
title = "Mean arrival delay by carrier",
fill = "airport"
) +
theme_bw() +
scale_fill_manual(values=c("#999999", "#E69F00", "#56B4E9")) +
theme(axis.text.x = element_text(angle = 0, vjust = 0.5, hjust = 1)) +
coord_flip()

Which season has the highest number of departure delays?
flights_dt %>%
filter(dep_delay >= 15) %>%
group_by(day = floor_date(sched_dep_time, "day"), origin) %>%
summarise(mean_delay = mean(dep_delay, na.rm = TRUE), .groups = "drop") %>%
ggplot() +
aes(x = day, y = mean_delay, colour = origin) +
geom_point(alpha = 0.8) +
labs(
x = "date",
y = "mean delay (minutes)",
title = "Mean departure delay by month"
) +
scale_colour_manual(values = c("#999999", "#E69F00", "#56B4E9")) +
theme_bw()

Over the course of a day when is the largest average delay?
flights_dt %>%
filter(dep_delay >= 15 & origin == "EWR") %>%
select(dep_delay, hour) %>%
group_by(hour) %>%
summarise(mean_delay = mean(dep_delay, na.rm = TRUE), .groups = "drop") %>%
ggplot() +
aes(x = hour, y = mean_delay) +
geom_point(alpha = 0.8, colour = "#56B4E9") +
geom_smooth(se = FALSE, colour = "#E69F00") +
labs(
x = "hour",
y = "mean delay (minutes)",
title = "Mean departure delay by departure time"
) +
theme_bw() +
scale_x_continuous(limits = c(5, 23), breaks = seq(5, 23, by = 1))
`geom_smooth()` using method = 'loess' and formula 'y ~ x'

How does distance of an outbound flight affect departure delays?
flights_dt %>%
filter(origin == "EWR") %>%
ggplot() +
aes(x = distance) +
geom_histogram(bins = 50) +
theme_bw() +
labs(title = "Distribution of flight distance")

positions <- c("<500mi", "500-1000mi", "1000-1500mi", "1500-2000mi", "2000-2500mi", "2500-3000mi",
">3000mi")
flights_dt %>%
filter(dep_delay >= 15 & origin == "EWR") %>%
mutate(distance = case_when(
distance <= 500 ~ "<500mi",
distance > 500 & distance <= 1000 ~ "500-1000mi",
distance > 1000 & distance <= 1500 ~ "1000-1500mi",
distance > 1500 & distance <= 2000 ~ "1500-2000mi",
distance > 2000 & distance <= 2500 ~ "2000-2500mi",
distance > 2500 & distance <= 3000 ~ "2500-3000mi",
distance > 3000 ~ ">3000mi"
)) %>%
group_by(distance) %>%
summarise(mean_delay = mean(dep_delay)) %>%
ggplot() +
aes(x = distance, y = mean_delay) +
geom_col(fill = "#999999", alpha = 0.8) +
labs(
x = "distance (miles)",
y = "mean departure delay (minutes)",
title = "Departure delay by flight distance"
) +
scale_x_discrete(limits = positions) +
theme_bw()

Which day is the best day to travel from Newark Int.?
flights_dt %>%
filter(dep_delay >= 15 & origin == "EWR") %>%
mutate(weekday = wday(sched_dep_time, label = TRUE)) %>%
group_by(weekday) %>%
summarise(mean_delay = mean(dep_delay)) %>%
ggplot() +
aes(x = weekday, y = mean_delay) +
geom_col(fill = "#999999", alpha = 0.8) +
labs(
x = "weekday",
y = "mean departure delay (minutes)",
title = "Departure delay by weekday"
) +
theme_bw()

Which are the most popular destinations from Newark Int?
flights_dt %>%
filter(origin == "EWR") %>%
group_by(dest, dest_name) %>%
summarise(count = n(), .groups = "drop") %>%
arrange(desc(count)) %>%
head(10) %>%
ggplot() +
aes(x = reorder(dest_name, count), y = count) +
geom_col(fill = "#999999", alpha = 0.8) +
labs(
x = "destination",
y = "number of flights per year",
title = "Destinations from Newark Int."
) +
theme_bw() +
coord_flip()

Which destinations suffer from the longest delays?
flights_dt %>%
drop_na(dest_name) %>%
filter(dep_delay >= 15 & origin == "EWR") %>%
group_by(dest, dest_name) %>%
summarise(mean_delay = mean(dep_delay, na.rm = TRUE), .groups = "drop") %>%
arrange(desc(mean_delay)) %>%
head(10) %>%
ggplot() +
aes(x = reorder(dest_name, mean_delay), y = mean_delay) +
geom_col(fill = "#999999", alpha = 0.8) +
labs(
x = "destination",
y = "mean delay (minutes)",
title = "Destinations with longest delays"
) +
theme_bw() +
coord_flip()

Trend weather conditions against mean departure delays
flights_dt %>%
filter(dep_delay >= 15 & origin == "EWR") %>%
group_by(day = floor_date(sched_dep_time, "day"), origin) %>%
summarise(mean_visibility = mean(visib, na.rm = TRUE),
mean_delay = mean(dep_delay), .groups = "drop") %>%
ggplot() +
aes(x = mean_visibility, y = mean_delay) +
geom_point(alpha = 0.7) +
geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
labs(
x = "mean visibility (miles)",
y = "mean departure delay (minutes)",
title = "Mean departure delay by visibility"
) +
theme_bw()
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 1 rows containing non-finite values (stat_smooth).
Warning: Removed 1 rows containing missing values (geom_point).

flights_dt %>%
filter(dep_delay >= 15 & origin == "EWR") %>%
group_by(day = floor_date(sched_dep_time, "day"), origin) %>%
summarise(mean_wind_speed = mean(wind_speed, na.rm = TRUE),
mean_delay = mean(dep_delay)) %>%
ggplot() +
aes(x = mean_wind_speed, y = mean_delay) +
geom_point(alpha = 0.7) +
geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
labs(
x = "mean wind speed (mph)",
y = "mean departure delay (minutes)",
title = "Mean departure delay by wind speed"
) +
theme_bw()
`summarise()` has grouped output by 'day'. You can override using the `.groups` argument.
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 1 rows containing non-finite values (stat_smooth).
Warning: Removed 1 rows containing missing values (geom_point).

flights_dt %>%
filter(dep_delay >= 15 & origin == "EWR") %>%
group_by(day = floor_date(sched_dep_time, "day"), origin) %>%
summarise(mean_wind_dir = mean(wind_dir, na.rm = TRUE),
mean_delay = mean(dep_delay)) %>%
ggplot() +
aes(x = mean_wind_dir, y = mean_delay) +
geom_point(alpha = 0.7) +
geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
labs(
x = "mean wind direction (degrees)",
y = "mean departure delay (minutes)",
title = "Mean departure delay by wind direction"
) +
theme_bw()
`summarise()` has grouped output by 'day'. You can override using the `.groups` argument.
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 1 rows containing non-finite values (stat_smooth).
Warning: Removed 1 rows containing missing values (geom_point).

flights_dt %>%
filter(dep_delay >= 15 & origin == "EWR") %>%
group_by(day = floor_date(sched_dep_time, "day"), origin) %>%
summarise(mean_humidity = mean(humid, na.rm = TRUE),
mean_delay = mean(dep_delay)) %>%
ggplot() +
aes(x = mean_humidity, y = mean_delay) +
geom_point(alpha = 0.7) +
geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
labs(
x = "mean humidity (%)",
y = "mean departure delay (minutes)",
title = "Mean departure delay by humidity"
) +
theme_bw()
`summarise()` has grouped output by 'day'. You can override using the `.groups` argument.
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 1 rows containing non-finite values (stat_smooth).
Warning: Removed 1 rows containing missing values (geom_point).

flights_dt %>%
filter(dep_delay >= 15 & origin == "EWR") %>%
group_by(day = floor_date(sched_dep_time, "day"), origin) %>%
summarise(mean_temp = mean(temp, na.rm = TRUE),
mean_delay = mean(dep_delay)) %>%
ggplot() +
aes(x = mean_temp, y = mean_delay) +
geom_point(alpha = 0.7) +
geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
labs(
x = "mean temperature (degf)",
y = "mean departure delay (minutes)",
title = "Mean departure delay by temperature"
) +
theme_bw()
`summarise()` has grouped output by 'day'. You can override using the `.groups` argument.
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 1 rows containing non-finite values (stat_smooth).
Warning: Removed 1 rows containing missing values (geom_point).

flights_dt %>%
filter(dep_delay >= 15 & origin == "EWR") %>%
group_by(day = floor_date(sched_dep_time, "day"), origin) %>%
summarise(mean_dewpoint = mean(dewp, na.rm = TRUE),
mean_delay = mean(dep_delay)) %>%
ggplot() +
aes(x = mean_dewpoint, y = mean_delay) +
geom_point(alpha = 0.7) +
geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
labs(
x = "mean dewpoint (degF)",
y = "mean departure delay (minutes)",
title = "Mean departure delay by dewpoint"
) +
theme_bw()
`summarise()` has grouped output by 'day'. You can override using the `.groups` argument.
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 1 rows containing non-finite values (stat_smooth).
Warning: Removed 1 rows containing missing values (geom_point).

flights_dt %>%
filter(dep_delay >= 15 & origin == "EWR") %>%
group_by(day = floor_date(sched_dep_time, "day"), origin) %>%
summarise(mean_precip = mean(precip, na.rm = TRUE),
mean_delay = mean(dep_delay)) %>%
ggplot() +
aes(x = mean_precip, y = mean_delay) +
geom_point(alpha = 0.7) +
geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
labs(
x = "mean precipitation (inches)",
y = "mean departure delay (minutes)",
title = "Mean departure delay by precipitation"
) +
theme_bw()
`summarise()` has grouped output by 'day'. You can override using the `.groups` argument.
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 1 rows containing non-finite values (stat_smooth).
Warning: Removed 1 rows containing missing values (geom_point).

flights_dt %>%
filter(dep_delay >= 15 & origin == "EWR") %>%
group_by(day = floor_date(sched_dep_time, "day"), origin) %>%
summarise(mean_pressure = mean(pressure, na.rm = TRUE),
mean_delay = mean(dep_delay)) %>%
ggplot() +
aes(x = mean_pressure, y = mean_delay) +
geom_point(alpha = 0.7) +
geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
labs(
x = "mean pressure (mbar)",
y = "mean departure delay (minutes)",
title = "Mean departure delay by pressure"
) +
theme_bw()
`summarise()` has grouped output by 'day'. You can override using the `.groups` argument.
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 1 rows containing non-finite values (stat_smooth).
Warning: Removed 1 rows containing missing values (geom_point).

---
title: "Exploratory Analysis"
output: html_notebook
---

```{r, warning=FALSE,message=FALSE}
pacman::p_load(
  tidyverse,
  here,
  RColorBrewer,
  lubridate,
  scales,
  GGally,
  stats,
  corrplot,
  leaps,
  glmulti,
  broom,
  rpart,
  rpart.plot,
  modelr,
  yardstick,
  caret,
  ranger,
  kableExtra
)
```

```{r, warning=FALSE,message=FALSE}
flights_dt <- read_csv(here("clean_data/flights_clean.csv"))

cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7", "#000000")

summary(flights_dt)
```

# Which airport in our dataset has the highest number of departures?

```{r}
flights_dt %>% 
  group_by(origin) %>%
  ggplot() +
  aes(x = origin, fill = origin) +
  geom_bar(alpha = 0.8) +
  scale_fill_manual(values = cbPalette) +
  labs(
    x = "airport",
    y = "departure numbers",
    title = "Departure numbers by airport"
  ) +
  guides(fill = "none") +
  theme_bw()
```

# What is the trend of delays over the year, how does it compare across the three airports?

```{r,warning=FALSE}
flights_dt %>% 
  filter(dep_delay >= 15) %>% 
  group_by(month = floor_date(sched_dep_time, "month"), origin) %>% 
  summarise(mean_delay = mean(dep_delay), .groups = "drop") %>% 
  ggplot() +
  aes(x = month, y = mean_delay, colour = origin) +
  geom_line(alpha = 0.6) +
  geom_point(alpha = 0.8) +
  labs(
    x = "month",
    y = "mean departure delay (minutes)",
    title = " Mean delay by month",
    colour = "airport"
  ) +
  scale_colour_manual(values = c("#999999", "#E69F00", "#56B4E9")) +
  theme_bw()
```
# Which months have the highest and lowest mean departure delays?

```{r}
# highest mean departure delay by month
flights_dt %>% 
  filter(dep_delay >= 15 & origin == "EWR") %>% 
  group_by(month = floor_date(sched_dep_time, "month"), origin) %>% 
  summarise(mean_delay = mean(dep_delay), .groups = "drop") %>% 
  arrange(desc(mean_delay))
```

```{r}
# lowest departure delay by month
flights_dt %>% 
  filter(dep_delay >= 15 & origin == "EWR") %>% 
  group_by(month = floor_date(sched_dep_time, "month"), origin) %>% 
  summarise(mean_delay = mean(dep_delay), .groups = "drop") %>% 
  arrange(mean_delay)
```

# What is the number of delayed flights over the year?

```{r}
flights_dt %>%
  filter(dep_delay >= 15) %>% 
  group_by(month = floor_date(sched_dep_time, "month"), origin) %>% 
  summarise(no_of_delays = n(), .groups = "drop") %>% 
  ggplot() +
  aes(x = month, y = no_of_delays, colour = origin) +
  geom_line(alpha = 0.6) +
  geom_point(alpha = 0.8) +
  labs(
    x = "month",
    y = "number of delays",
    title = "Number of delayed flights by month",
    colour = "airport"
  ) +
  scale_colour_manual(values = c("#999999", "#E69F00", "#56B4E9")) +
  theme_bw()
```
# Which month has the highest number of delays?

```{r}
flights_dt %>%
  filter(dep_delay >= 15 & origin == "EWR") %>% 
  group_by(month = floor_date(sched_dep_time, "month"), origin) %>% 
  summarise(no_of_delays = n(), .groups = "drop") %>% 
  arrange(desc(no_of_delays))
```

# What were the weather conditions during these months?

```{r}
flights_dt %>% 
  mutate(date = as.Date(dep_time, label = TRUE),
         month = month(dep_time, label = TRUE)) %>% 
  filter(dep_delay >= 15 & origin == "EWR" & month == "Dec") %>%
  select(date, month, pressure, precip, temp, wind_speed, wind_dir, dewp, visib, humid) %>% 
  group_by(date) %>% 
  summarise(
    mean_pressure = mean(pressure, na.rm = TRUE),
    mean_precip = mean(precip, na.rm = TRUE),
    mean_temp = mean(temp, na.rm = TRUE),
    mean_wind_speed = mean(wind_speed, na.rm = TRUE),
    mean_wind_dir = mean(wind_dir, na.rm = TRUE),
    mean_dewp = mean(dewp, na.rm = TRUE),
    mean_visib = mean(visib, na.rm = TRUE),
    mean_humid = mean(humid, na.rm = TRUE)
  ) %>% 
  pivot_longer(
    cols = mean_pressure:mean_humid,
    names_to = "variable",
    values_to = "value"
  ) %>% 
  ggplot() +
  aes(x = date, y = value, colour = variable) +
  geom_line() +
  scale_colour_manual(values = cbPalette) +
  theme_bw()
```

```{r}
flights_dt %>% 
  mutate(date = as.Date(dep_time, label = TRUE),
         month = month(dep_time, label = TRUE)) %>% 
  filter(dep_delay >= 15 & origin == "EWR" & month == "Dec") %>%
  select(date, dep_delay, month, pressure, precip, temp, wind_speed, wind_dir, dewp, visib, humid) %>% 
  group_by(date) %>% 
  summarise(
    departure_delay = mean(dep_delay),
    pressure = mean(pressure),
    precipitation = mean(precip),
    temperature = mean(temp),
    wind_speed = mean(wind_speed),
    wind_direction = mean(wind_dir),
    dewpoint = mean(dewp),
    visibility = mean(visib),
    humidity = mean(humid)
  ) %>% 
  na.omit() %>% 
  kable(align = "c") %>%
  kable_styling(full_width = FALSE) %>% 
  column_spec(1:10, color = "black")
```  

```{r}
flights_dt %>% 
  mutate(date = as.Date(dep_time, label = TRUE),
         month = month(dep_time, label = TRUE)) %>% 
  filter(dep_delay >= 15 & origin == "EWR" & month == "Dec") %>%
  select(date, dep_delay, month, pressure, precip, temp, wind_speed, wind_dir, dewp, visib, humid) %>% 
  group_by(date) %>% 
  summarise(
    departure_delay = scale(dep_delay),
    pressure = scale(pressure),
    precipitation = scale(precip),
    temperature = scale(temp),
    wind_speed = scale(wind_speed),
    wind_direction = scale(wind_dir),
    dewpoint = scale(dewp),
    visibility = scale(visib),
    humidity = scale(humid)
  ) %>% 
  na.omit() %>% 
  pivot_longer(
    cols = departure_delay:humidity,
    names_to = "variable",
    values_to = "value"
  ) %>% 
  ggplot() +
  aes(x = date, y = value, colour = variable) +
  geom_point(alpha = 0.8) +
  scale_colour_manual(values = cbPalette) +
  scale_x_date(labels = date_format("%d"), breaks = date_breaks("day")) +
  labs(
    colour = "",
    title = "Weather variables December"
  ) +
  theme_bw()
```

# What is the number of flights per month?

```{r}
flights_dt %>%
  mutate(month = month(sched_dep_time, label = TRUE)) %>% 
  group_by(month, origin) %>% 
  summarise(no_of_flights = n(), .groups = "drop") %>% 
  ggplot() +
  aes(x = month, y = no_of_flights, fill = origin) +
  geom_col(position = "dodge", alpha = 0.8) +
  labs(
    x = "month",
    y = "number of departures",
    title = "Departure numbers by month",
    fill = "airport"
  ) +
  scale_fill_manual(values = cbPalette) +
  theme_bw()
```

# Which carrier has the most departure delays?

```{r}
flights_dt %>% 
  filter(dep_delay >= 15) %>% 
  group_by(carrier_name, origin) %>% 
  summarise(mean_delay = mean(dep_delay), .groups = "drop") %>% 
  ggplot() +
  aes(x = reorder(carrier_name, mean_delay), y = mean_delay, fill = origin) +
  geom_col(alpha = 0.8) +
  facet_wrap(~ origin) +
  labs(
    x = "carrier",
    y = "mean delay (minutes)",
    title = "Mean departure delay by carrier",
    fill = "airport"
  ) +
  theme_bw() +
  scale_fill_manual(values=c("#999999", "#E69F00", "#56B4E9")) +
  theme(axis.text.x = element_text(angle = 0, vjust = 0.5, hjust = 1)) +
  coord_flip()
```

# Which carrier has the most arrival delays?

```{r}
flights_dt %>% 
  filter(arr_delay >= 15) %>% 
  group_by(carrier_name, origin) %>% 
  summarise(mean_delay = mean(arr_delay), .groups = "drop") %>% 
  ggplot() +
  aes(x = reorder(carrier_name, mean_delay), y = mean_delay, fill = origin) +
  geom_col(alpha = 0.8) +
  facet_wrap(~ origin) +
  labs(
    x = "carrier",
    y = "mean delay (minutes)",
    title = "Mean arrival delay by carrier",
    fill = "airport"
  ) +
  theme_bw() +
  scale_fill_manual(values=c("#999999", "#E69F00", "#56B4E9")) +
  theme(axis.text.x = element_text(angle = 0, vjust = 0.5, hjust = 1)) +
  coord_flip()
```

# Which season has the highest number of departure delays?

```{r}
flights_dt %>% 
  filter(dep_delay >= 15) %>% 
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>% 
  summarise(mean_delay = mean(dep_delay, na.rm = TRUE), .groups = "drop") %>% 
  ggplot() +
  aes(x = day, y = mean_delay, colour = origin) +
  geom_point(alpha = 0.8) +
  labs(
    x = "date",
    y = "mean delay (minutes)",
    title = "Mean departure delay by month"
  ) +
  scale_colour_manual(values = c("#999999", "#E69F00", "#56B4E9")) +
  theme_bw()
```

# Over the course of a day when is the largest average delay?

```{r}
flights_dt %>% 
  filter(dep_delay >= 15 & origin == "EWR") %>% 
  select(dep_delay, hour) %>%
  group_by(hour) %>% 
  summarise(mean_delay = mean(dep_delay, na.rm = TRUE), .groups = "drop") %>% 
  ggplot() +
  aes(x = hour, y = mean_delay) +
  geom_point(alpha = 0.8, colour = "#56B4E9") +
  geom_smooth(se = FALSE, colour = "#E69F00") +
  labs(
    x = "hour",
    y = "mean delay (minutes)",
    title = "Mean departure delay by departure time"
  ) +
  theme_bw() +
  scale_x_continuous(limits = c(5, 23), breaks = seq(5, 23, by = 1))
```
# How does distance of an outbound flight affect departure delays?

```{r}
flights_dt %>% 
  filter(origin == "EWR") %>% 
  ggplot() +
  aes(x = distance) +
  geom_histogram(bins = 50) +
  theme_bw() +
  labs(title = "Distribution of flight distance")
```

```{r}
positions <- c("<500mi", "500-1000mi", "1000-1500mi", "1500-2000mi", "2000-2500mi", "2500-3000mi",
               ">3000mi")

flights_dt %>% 
  filter(dep_delay >= 15 & origin == "EWR") %>% 
  mutate(distance = case_when(
    distance <= 500 ~ "<500mi",
    distance > 500 & distance <= 1000 ~ "500-1000mi",
    distance > 1000 & distance <= 1500 ~ "1000-1500mi",
    distance > 1500 & distance <= 2000 ~ "1500-2000mi",
    distance > 2000 & distance <= 2500 ~ "2000-2500mi",
    distance > 2500 & distance <= 3000 ~ "2500-3000mi",
    distance > 3000 ~ ">3000mi"
  )) %>% 
  group_by(distance) %>% 
  summarise(mean_delay = mean(dep_delay)) %>% 
  ggplot() +
  aes(x = distance, y = mean_delay) +
  geom_col(fill = "#999999", alpha = 0.8) +
  labs(
    x = "distance (miles)",
    y = "mean departure delay (minutes)",
    title = "Departure delay by flight distance"
  ) +
  scale_x_discrete(limits = positions) +
  theme_bw()
```

# Which day is the best day to travel from Newark Int.?

```{r}
flights_dt %>%
  filter(dep_delay >= 15 & origin == "EWR") %>% 
  mutate(weekday = wday(sched_dep_time, label = TRUE)) %>% 
  group_by(weekday) %>% 
  summarise(mean_delay = mean(dep_delay)) %>% 
  ggplot() +
  aes(x = weekday, y = mean_delay) +
  geom_col(fill = "#999999", alpha = 0.8) +
  labs(
    x = "weekday",
    y = "mean departure delay (minutes)",
    title = "Departure delay by weekday"
  ) +
  theme_bw()
```

# Which are the most popular destinations from Newark Int?

```{r}
flights_dt %>% 
  filter(origin == "EWR") %>% 
  group_by(dest, dest_name) %>% 
  summarise(count = n(), .groups = "drop") %>% 
  arrange(desc(count)) %>% 
  head(10) %>% 
  ggplot() +
  aes(x = reorder(dest_name, count), y = count) +
  geom_col(fill = "#999999", alpha = 0.8) +
  labs(
    x = "destination",
    y = "number of flights per year",
    title = "Destinations from Newark Int."
  ) +
  theme_bw() +
  coord_flip()
```

# Which destinations suffer from the longest delays?

```{r}
flights_dt %>%
  drop_na(dest_name) %>% 
  filter(dep_delay >= 15 & origin == "EWR") %>% 
  group_by(dest, dest_name) %>% 
  summarise(mean_delay = mean(dep_delay, na.rm = TRUE), .groups = "drop") %>% 
  arrange(desc(mean_delay)) %>% 
  head(10) %>% 
  ggplot() +
  aes(x = reorder(dest_name, mean_delay), y = mean_delay) +
  geom_col(fill = "#999999", alpha = 0.8) +
  labs(
    x = "destination",
    y = "mean delay (minutes)",
    title = "Destinations with longest delays"
  ) +
  theme_bw() +
  coord_flip()
```

# Trend weather conditions against mean departure delays

```{r}
flights_dt %>% 
  filter(dep_delay >= 15 & origin == "EWR") %>% 
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>% 
  summarise(mean_visibility = mean(visib, na.rm = TRUE),
            mean_delay = mean(dep_delay), .groups = "drop") %>% 
  ggplot() +
  aes(x = mean_visibility, y = mean_delay) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
  labs(
    x = "mean visibility (miles)",
    y = "mean departure delay (minutes)",
    title = "Mean departure delay by visibility"
  ) +
  theme_bw()
```

```{r}
flights_dt %>% 
  filter(dep_delay >= 15 & origin == "EWR") %>% 
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>% 
  summarise(mean_wind_speed = mean(wind_speed, na.rm = TRUE),
            mean_delay = mean(dep_delay)) %>% 
  ggplot() +
  aes(x = mean_wind_speed, y = mean_delay) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
  labs(
    x = "mean wind speed (mph)",
    y = "mean departure delay (minutes)",
    title = "Mean departure delay by wind speed"
  ) +
  theme_bw()
```

```{r}
flights_dt %>% 
  filter(dep_delay >= 15 & origin == "EWR") %>% 
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>% 
  summarise(mean_wind_dir = mean(wind_dir, na.rm = TRUE),
            mean_delay = mean(dep_delay)) %>% 
  ggplot() +
  aes(x = mean_wind_dir, y = mean_delay) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
  labs(
    x = "mean wind direction (degrees)",
    y = "mean departure delay (minutes)",
    title = "Mean departure delay by wind direction"
  ) +
  theme_bw()
```

```{r}
flights_dt %>%
  filter(dep_delay >= 15 & origin == "EWR") %>% 
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>% 
  summarise(mean_humidity = mean(humid, na.rm = TRUE),
            mean_delay = mean(dep_delay)) %>% 
  ggplot() +
  aes(x = mean_humidity, y = mean_delay) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
  labs(
    x = "mean humidity (%)",
    y = "mean departure delay (minutes)",
    title = "Mean departure delay by humidity"
  ) +
  theme_bw()
```

```{r}
flights_dt %>% 
  filter(dep_delay >= 15 & origin == "EWR") %>% 
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>% 
  summarise(mean_temp = mean(temp, na.rm = TRUE),
            mean_delay = mean(dep_delay)) %>% 
  ggplot() +
  aes(x = mean_temp, y = mean_delay) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
  labs(
    x = "mean temperature (degf)",
    y = "mean departure delay (minutes)",
    title = "Mean departure delay by temperature"
  ) +
  theme_bw()
```

```{r}
flights_dt %>% 
  filter(dep_delay >= 15 & origin == "EWR") %>% 
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>% 
  summarise(mean_dewpoint = mean(dewp, na.rm = TRUE),
            mean_delay = mean(dep_delay)) %>% 
  ggplot() +
  aes(x = mean_dewpoint, y = mean_delay) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
  labs(
    x = "mean dewpoint (degF)",
    y = "mean departure delay (minutes)",
    title = "Mean departure delay by dewpoint"
  ) +
  theme_bw()
```

```{r}
flights_dt %>% 
  filter(dep_delay >= 15 & origin == "EWR") %>% 
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>% 
  summarise(mean_precip = mean(precip, na.rm = TRUE),
            mean_delay = mean(dep_delay)) %>% 
  ggplot() +
  aes(x = mean_precip, y = mean_delay) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
  labs(
    x = "mean precipitation (inches)",
    y = "mean departure delay (minutes)",
    title = "Mean departure delay by precipitation"
  ) +
  theme_bw()
```

```{r}
flights_dt %>%
  filter(dep_delay >= 15 & origin == "EWR") %>% 
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>% 
  summarise(mean_pressure = mean(pressure, na.rm = TRUE),
            mean_delay = mean(dep_delay)) %>% 
  ggplot() +
  aes(x = mean_pressure, y = mean_delay) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
  labs(
    x = "mean pressure (mbar)",
    y = "mean departure delay (minutes)",
    title = "Mean departure delay by pressure"
  ) +
  theme_bw()
```



